import pandas as pd
import numpy as np

from sklearn.datasets import fetch_california_housing

from evidently import ColumnMapping

from evidently.report import Report
from evidently.metrics.base_metric import generate_column_metrics
from evidently.metric_preset import DataDriftPreset, TargetDriftPreset, DataQualityPreset, RegressionPreset
from evidently.metrics import *

from evidently.test_suite import TestSuite
from evidently.tests.base_test import generate_column_tests
from evidently.test_preset import DataStabilityTestPreset, NoTargetPerformanceTestPreset, RegressionTestPreset
from evidently.tests import *
import warnings
# Suppress all warnings for the rest of the session (presumably to keep the
# notebook output uncluttered). One catch-all "ignore" filter is enough:
# filterwarnings('ignore') with no further arguments installs the very same
# filter entry, so calling both was redundant.
# NOTE(review): this also hides deprecation warnings from the libraries used
# below; narrow it with `category=` if only specific warnings are noisy.
warnings.simplefilter('ignore')
# Load the `kedro.ipython` extension. This is an IPython line magic, so it is
# only valid inside an IPython/Jupyter session, not as plain Python.
%load_ext kedro.ipython
# `catalog` is not defined anywhere in this file; presumably it is injected
# into the session by the extension loaded above (confirm against the Kedro
# docs). As a bare expression, it displays the catalog in a notebook cell.
catalog
# Load the catalog entry "dataset_id_742" and preview its first rows.
# The captured log output below reports this entry as an MlflowCSVDataset,
# and the preview shows 33 columns, including a boolean-looking `y` column.
dataset = catalog.load("dataset_id_742")
dataset.head()
[09/25/24 19:12:02] INFO     Loading data from dataset_id_742 (MlflowCSVDataset)...             data_catalog.py:539

checking_status duration credit_history purpose credit_amount savings_status employment installment_commitment personal_status other_parties ... X_3 X_4 X_5 X_6 X_7 X_8 X_9 X_10 X_11 y
0 0<=X<200 30 all paid furniture/equipment 3496.0 >=1000 1<=X<4 4 male single none ... 0.449680 4.055515 0.001283 0.191197 0.449680 0.305551 0.750874 0.765611 0.623008 False
1 no checking 42 existing paid radio/tv 7166.0 no known savings 4<=X<7 2 male mar/wid none ... 0.623008 4.962082 0.396208 0.525043 0.378336 0.887402 0.342156 0.938944 0.266451 True
2 no checking 18 existing paid furniture/equipment 1984.0 <100 1<=X<4 4 male single none ... 0.887402 4.421564 0.818652 0.508065 0.898812 0.255605 0.319191 0.535321 0.444986 True
3 no checking 48 critical/other existing credit radio/tv 3578.0 no known savings >=7 4 male single none ... 0.818652 6.080646 0.445092 0.974743 0.433125 0.009822 0.782929 0.231668 0.204295 True
4 0<=X<200 30 existing paid furniture/equipment 3441.0 100<=X<500 1<=X<4 2 female div/dep/mar co applicant ... 0.319191 6.353212 0.078052 0.104358 0.348942 0.513697 0.839659 0.362052 0.639795 False

5 rows × 33 columns

# Load the catalog entry "preprocessed_dataset" and preview its first rows.
# The captured log output below reports this entry as an MlflowParquetDataset;
# the preview shows 73 columns.
# NOTE(review): `df` is not referenced again in the visible part of this
# file -- the Evidently runs below use the raw `dataset`. Confirm that is
# intended.
df = catalog.load("preprocessed_dataset")
df.head()
[09/25/24 19:14:56] INFO     Loading data from preprocessed_dataset (MlflowParquetDataset)...   data_catalog.py:539

duration credit_amount installment_commitment residence_since age existing_credits num_dependents X_1 X_2 X_3 ... job_high qualif/self emp/mgmt job_skilled job_unemp/unskilled non res job_unskilled resident own_telephone_none own_telephone_yes foreign_worker_no foreign_worker_yes health_status_bad health_status_good
0 0.616420 -0.058349 0.874126 -0.751108 -0.295901 -0.692308 2.064742 -1.703735 -1.129340 -0.302667 ... False True False False False True False True False True
1 1.585887 1.013586 -0.985716 1.058791 -0.723504 -0.692308 -0.484322 0.753875 0.840695 0.260109 ... False True False False False True False True False True
2 -0.353048 -0.499975 0.874126 1.058791 0.815866 1.230769 -0.484322 0.013467 -0.487520 1.118564 ... False True False False True False False True False True
3 2.070621 -0.034398 0.874126 -1.656058 0.815866 -0.692308 -0.484322 1.370483 -0.871247 0.895340 ... False True False False False True False True False True
4 0.616420 -0.074413 -0.985716 1.058791 -1.407669 -0.692308 -0.484322 1.238906 -0.908444 -0.726348 ... False True False False True False False True True False

5 rows × 73 columns

# Run Evidently's DataStabilityTestPreset on the raw dataset, treating the
# first 40 rows as "current" data and all remaining rows as "reference".
# NOTE(review): the split index 40 is hard-coded, while the drift report
# further down splits at row 60 -- confirm the difference is intentional.
data_stability = TestSuite(tests=[DataStabilityTestPreset()])
data_stability.run(
    reference_data=dataset.iloc[40:],
    current_data=dataset.iloc[:40],
    # No explicit ColumnMapping is supplied.
    # NOTE(review): the dataset preview shows a `y` column; confirm whether
    # it should be declared as the target.
    column_mapping=None,
)
# Bare expression: in a notebook cell this displays the test-suite results.
data_stability
# Build an Evidently report with DataDriftPreset on the raw dataset, treating
# the first 60 rows as "current" data and all remaining rows as "reference".
# NOTE(review): the split index 60 is hard-coded and differs from the 40 used
# for the stability test suite above -- confirm the difference is intentional.
data_drift_report = Report(metrics=[DataDriftPreset()])
data_drift_report.run(
    reference_data=dataset.iloc[60:],
    current_data=dataset.iloc[:60],
    # No explicit ColumnMapping is supplied.
    column_mapping=None,
)
# Bare expression: in a notebook cell this displays the rendered report.
data_drift_report